In [ ]:
import pickle;
from imp import reload
import numpy as np; import pandas as pd
import lightgbm as lgb; import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import constants, utils, inference, evaluation
from joblib import Parallel, delayed
import multiprocessing
import os
pd.options.mode.chained_assignment = None

In [2]:
def generate_pred(m, idx, is_sub=False):
    '''
    Score one pickled tree model on the global test feature frame.

    Parameters
    ----------
    m : str
        Path to a pickled booster (lightgbm or xgboost).
    idx : int
        Position of the model in the bagging list (used for logging only).
    is_sub : bool
        When True, skip evaluation and return only the prediction vector.

    Returns
    -------
    (eval_res, pred) when is_sub is False — eval_res is a dict with
    'model_file', 'eval_auc', 'eval_mf1'; otherwise pred alone.

    Relies on notebook globals: `data`, `gid`, and (evaluation mode) `label`.
    '''
    print('Evaluating Model {} ...'.format(idx))
    print('Model Path {}'.format(m))
    # Context manager closes the handle; the original pickle.load(open(...))
    # leaked it. NOTE(review): only unpickle trusted model artifacts.
    with open(m, 'rb') as fin:
        bst = pickle.load(fin)
    feat = data[utils.get_feat_col(bst)]  # `data` is a notebook global
    pred = utils.get_predition(bst, feat)

    # .copy() so the column assignments below do not write into a view of
    # the global `gid` frame (the chained-assignment warning is silenced
    # globally in this notebook, which would otherwise hide this).
    user_product = gid[['user_id', 'product_id', 'order_id']].copy()
    user_product['score'] = pred

    if is_sub is False:
        user_product['label'] = label  # `label` is a notebook global
        auc = roc_auc_score(label, pred)
        print('Evaluation AUC {}'.format(auc))
        op = user_product.copy()
        op = utils.tarbox_f1_optim(op, low_bound=0)
        # 'None' is a sentinel token for "no products", not a product id —
        # keep it as a string while converting real ids to int.
        op['products'] = op['products'].apply(
            lambda x: [int(i) if i != 'None' else i for i in x.split()])
        # Left-merge back onto the full order list so orders dropped by the
        # optimizer still appear (with NaN products).
        op = pd.merge(pd.DataFrame({'order_id': user_product.order_id.unique()}),
                      op, on=['order_id'], how='left')
        gold = evaluation.get_gold(user_product)
        res = evaluation.evaluation(gold, op[['order_id', 'products']])
        mf1 = res.f1score.mean()
        print('F1 Optimization Result: mean-f1-score {}'.format(mf1))
        eval_res = {'model_file': os.path.basename(m),
                    'eval_auc': auc, 'eval_mf1': mf1}
        return eval_res, pred
    else:
        return pred

Load Submission Data


In [5]:
data = pd.read_hdf('/data/Instacart/test.h5')

In [6]:
# Per-order and per-(order, product) views of the test frame, used later for
# merging predictions and assembling the submission.
orders = data.loc[:, ['order_id']].drop_duplicates()
up_pair = data.loc[:, ['order_id', 'product_id']].drop_duplicates()
gid = data[constants.ID_COLS]

In [7]:
# Per-model evaluation results, ordered worst-to-best by mean F1.
bagging_tree = (
    pd.read_hdf(constants.EVA_DATA_DIR + 'bagging_tree.h5')
    .sort_values('eval_mf1')
)

In [8]:
bagging_tree.reset_index()


Out[8]:
index eval_auc eval_mf1 model_file
0 1 0.837777 0.402564 /home/public/Instacart/lgb/lgb_mtwdr_goss_0.83...
1 0 0.838028 0.402665 /home/public/Instacart/lgb/lgb_mtwdr_goss_0.83...
2 3 0.838212 0.402834 /home/public/Instacart/lgb/lgb_mtwdr_goss_0.83...
3 2 0.837845 0.402956 /home/public/Instacart/lgb/lgb_mtwdr_goss_0.83...
4 1 0.839410 0.402959 /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
5 0 0.839316 0.403175 /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
6 15 0.839354 0.403207 /home/public/Instacart/lgb/lgb_gbdt_0.83976702...
7 0 0.838976 0.403230 /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
8 16 0.839337 0.403314 /home/public/Instacart/lgb/lgb_gbdt_0.83975988...
9 22 0.839243 0.403543 /home/public/Instacart/lgb/lgb_gbdt_0.84022029...
10 1 0.839331 0.403580 /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
11 0 0.839467 0.403595 /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
12 0 0.839425 0.403769 /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
13 1 0.839287 0.403786 /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
14 0 0.839904 0.404341 /home/public/Instacart/xgb/xgb_vip_0.839904_0....
15 0 0.840101 0.404459 /home/public/Instacart/xgb/xgb_vip_gbtree_0.84...

In [9]:
# 11mins
# 11mins
# Cache each model's prediction vector so re-running the notebook skips the
# expensive scoring pass.
pred_subs = []
for idx, m in enumerate(bagging_tree.model_file.values):
    # Path computed once and reused for read and write (the original repeated
    # the expression, which invites drift). NOTE(review): the suffix is 'pkl'
    # with no dot separator — kept byte-identical so existing cache files are
    # still found; change to '.pkl' only together with a cache wipe.
    fp = './submission/' + m.split('/')[-1] + 'pkl'
    if os.path.exists(fp):
        # Context manager closes the handle the original open() leaked.
        with open(fp, 'rb') as f:
            pred = pickle.load(f)
    else:
        pred = generate_pred(m, idx, is_sub=True)
        with open(fp, 'wb') as f:
            pickle.dump(pred, f, pickle.HIGHEST_PROTOCOL)
    pred_subs.append(pred)

In [29]:
# Hierarchical median bagging: each level folds a new group of models into
# the running median of the previous level. (Scores in comments are
# leaderboard mean-F1 observed for each stage.)
level0 = np.median(pred_subs[:5], axis=0)              # 0.4034
level1 = np.median([level0] + pred_subs[5:9], axis=0)  # 0.40429
level2 = np.median([level1] + pred_subs[9:14], axis=0)

Bagging: Median


In [13]:
# Align the arboretum model's predictions with the (order, product) grid;
# pairs absent from the CSV come back as NaN.
aboretum_pred = pd.merge(
    gid[['order_id', 'product_id']],
    pd.read_csv('./submission/prediction_arboretum.csv'),
    on=['order_id', 'product_id'],
    how='left',
)

In [14]:
# Align the big xgboost model's scores with the (order, product) grid and
# zero-fill pairs it did not score.
big_model_pred = pd.merge(gid[['order_id', 'product_id']],
                          pd.read_csv('./submission/xgb_score_0.8406880155364034.csv'),
                          on=['order_id', 'product_id'], how='left')
# Assign back instead of `big_model_pred.score.fillna(0, inplace=True)`:
# inplace fillna on an attribute-accessed column is a chained-assignment
# pattern that pandas >= 3.0 rejects and older pandas may silently drop.
big_model_pred['score'] = big_model_pred['score'].fillna(0)

In [25]:
big_lgb_dart = pd.read_csv('./submission/big_lgb_dart_0.8386003614599506.csv')

In [34]:
# Final ensemble members: the level-2 bagged median, the two top tree models,
# the arboretum predictions, and the big xgboost score column.
predictions = [
    level2,
    pred_subs[14],
    pred_subs[15],
    aboretum_pred.prediction.values,
    big_model_pred.score.values,
]

In [35]:
# .copy() so adding the 'score' column does not write into a view of `data`
# (the chained-assignment warning is globally silenced in this notebook,
# which would otherwise hide the problem).
user_product = data[['user_id', 'product_id', 'order_id']].copy()
user_product['score'] = np.median(predictions, axis=0)

In [ ]:
# Build the shing-optimized submission.
op = user_product.copy()
op = utils.shing_f1_optim(op, low_bound=0.01, topk=200)
# Right-join so every test order appears, even ones the optimizer dropped.
op = pd.merge(op[['order_id', 'products']], orders[['order_id']],
              on=['order_id'], how='right')
op.columns = ['order_id', 'products']
# Vectorized fillna replaces the original per-row .at loop: orders with no
# predicted products get the literal string 'None' the submission format expects.
op['products'] = op['products'].fillna('None')
op[['order_id', 'products']].to_csv('./submission/lgb3_big_bag_shing.csv', index=False)

In [37]:
# Build the tarbox-optimized submission (same pipeline as the shing variant).
op = user_product.copy()
op = utils.tarbox_f1_optim(op, low_bound=0)
# Right-join so every test order appears, even ones the optimizer dropped.
op = pd.merge(op[['order_id', 'products']], orders[['order_id']],
              on=['order_id'], how='right')
op.columns = ['order_id', 'products']
# Vectorized fillna replaces the original per-row .at loop: orders with no
# predicted products get the literal string 'None' the submission format expects.
op['products'] = op['products'].fillna('None')
op[['order_id', 'products']].to_csv('./submission/final_bag1.csv', index=False)


CPU times: user 1min 27s, sys: 7.16 s, total: 1min 34s
Wall time: 1min 47s

In [ ]: